---
title: "R Notebook"
output: html_notebook
---

# Machine learning project
## Authors: Jose Pérez Cano & Álvaro Ribot Barrado

### 0. Libraries

```{r, eval=FALSE}
# Install the required packages (run once; eval=FALSE keeps them from
# reinstalling on every knit)
install.packages("klaR")
install.packages("TunePareto")
install.packages("rgl")
install.packages("glmnet")
install.packages("ca")
```

```{r}
# LDA/ QDA
library(MASS)

# RDA
library(klaR)

# Multinomial
library(nnet)

# Cross-Validation
library(TunePareto)

# Naive Bayes
library(e1071)

# k-NN
library(class)

# Correspondence analysis
library(ca)

# Cross-validation nn
library(caret)
```

### 1. Read data

```{r}
set.seed(2105)
# In a notebook, setwd() only holds for the current chunk, so the file is
# read in the same chunk; use knitr's root.dir option to make it persistent.
setwd("../data")
clev <- read.csv("cleveland.csv", header=F)
head(clev)
```

### 2. Preprocess data

The dataset has missing values that must be handled: columns with too many missing entries are removed, and the remaining gaps are imputed.

```{r}
source("Preprocessing.R")

# Missing values: keep columns below the 60% NA threshold, drop columns
# judged uninformative, then impute the remaining gaps with 7-NN
clev <- clev[,much.na.cols(clev,60)]
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28", "location")
clev <- remove.var(clev, dummy)
clev <- knn.imputation(clev, 7)

# Multicollinearity: V57 and V55 were flagged by the (commented-out) check below
#corr.factors <- cor(clev)
#which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)
clev <- remove.var(clev, c("V57", "V55"))

# Convert the categorical columns to factors
factores <- c("V58", "V4", "V9", "V16", "V18", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V41", "V51", "V56", "V11", "V59", "V60", "V61", "V63", "V65", "V67", "V68")
for (f in factores){
  clev[,f] <- as.factor(clev[,f])
}
clev <- move.value(clev, "V25", 2, 1)  # merge the rare level 2 of V25 into 1
```
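
`Preprocessing.R` is not reproduced in this notebook. As a rough sketch of what the helpers above might look like (hypothetical implementations; in particular, delegating the imputation to `VIM::kNN` is our assumption, not the project's actual code):

```{r, eval=FALSE}
# Not run: hypothetical sketches of the helpers sourced from Preprocessing.R.

# Logical mask of the columns whose percentage of NAs stays below `pct`:
much.na.cols <- function(df, pct) {
  colMeans(is.na(df)) * 100 < pct
}

# Drop the listed variables by name:
remove.var <- function(df, vars) {
  df[, !(names(df) %in% vars)]
}

# k-NN imputation, here delegated to the VIM package for illustration:
knn.imputation <- function(df, k) {
  VIM::kNN(df, k = k, imp_var = FALSE)
}
```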

```{r}
summary(clev)
```

#### 2.1 Visualizations

```{r}
source("Visualizations.R")

histograms(clev)
boxplot.num(clev)
histograms(clev, F)
show.cor(clev)
```
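
`Visualizations.R` is also external. Judging from the commented-out correlation check in the preprocessing chunk, `show.cor` plausibly reports highly correlated numeric pairs; a hedged sketch:

```{r, eval=FALSE}
# Not run: hypothetical sketch of show.cor, listing pairs of numeric
# variables whose absolute correlation exceeds a threshold (diagonal zeroed).
show.cor <- function(df, threshold = 0.8) {
  r <- cor(df[sapply(df, is.numeric)])
  which(abs(r) - diag(diag(r)) > threshold, arr.ind = TRUE)
}
```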

#### 2.2 Modification of values

```{r}
qqplots(clev)
```

Box-Cox transformations:

```{r}
boxcox.plots(clev)
```
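
`boxcox.plots` is sourced as well; assuming it profiles the Box-Cox likelihood of each strictly positive numeric column, a minimal sketch with `MASS::boxcox` (already loaded) could be:

```{r, eval=FALSE}
# Not run: hypothetical sketch of boxcox.plots, one Box-Cox likelihood
# profile per strictly positive numeric column.
boxcox.plots <- function(df) {
  num <- df[sapply(df, is.numeric)]
  for (v in names(num)) {
    x <- num[[v]]
    if (all(x > 0, na.rm = TRUE)) {
      boxcox(lm(x ~ 1))   # lambda near 0 suggests log, near 0.5 sqrt
      title(main = v)
    }
  }
}
```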

Variables with many zeros:

```{r}
boxcox.plot.special(clev, c("V14", "V15", "V40"))
```

```{r}
clev <- apply.trans(clev, sqrt.neg.vars=c("V10", "V12", "V31", "V43"), sqrt.vars = c("V14", "V40"))
clev <- scale.num(clev)
```
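
`apply.trans` and `scale.num` come from `Preprocessing.R` too; a sketch of their assumed behavior (the handling of `sqrt.neg.vars` is guessed from the argument name):

```{r, eval=FALSE}
# Not run: hypothetical sketches of apply.trans and scale.num.
apply.trans <- function(df, sqrt.neg.vars = c(), sqrt.vars = c(),
                        log.vars = c()) {
  for (v in sqrt.neg.vars) df[[v]] <- sqrt(df[[v]] - min(df[[v]], na.rm = TRUE))
  for (v in sqrt.vars)     df[[v]] <- sqrt(df[[v]])
  for (v in log.vars)      df[[v]] <- log1p(df[[v]])  # log(1 + x) handles zeros
  df
}

scale.num <- function(df) {
  num <- sapply(df, is.numeric)
  df[num] <- scale(df[num])   # standardize numeric columns to mean 0, sd 1
  df
}
```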

This is the final dataset.

```{r}
summary(clev)
```

#### 2.3 Feature extraction

Separate the data into training and test sets; the seed makes the split reproducible.

```{r}
set.seed(2000)
n <- nrow(clev)
train.length <- round(2*n/3)   # 2/3 train, 1/3 test

clev <- clev[sample(n),]
train <- clev[1:train.length,]
test <- clev[(train.length+1):n,]

# Map the five levels of V58 (0-4) to plotting colors
col.class <- c("red", "green", "blue", "yellow", "purple")[as.numeric(train$V58)]
```
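
The split is purely random rather than stratified; a quick check (added here for illustration) that the class proportions survive it:

```{r}
# Class proportions in the full data versus the training split:
round(rbind(full  = prop.table(table(clev$V58)),
            train = prop.table(table(train$V58))), 2)
```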

```{r}
pca <- pca.num(train)
plot.pca(train, col.class, pca = pca)
```

```{r}
fda <- plot.fda(train, V58~.-V21-V22-V59, col.class)

train <- extract.fda(fda, train)
test <- extract.fda(fda, test)
```

Multiple correspondence analysis of the factor variables:

```{r}
mca.features <- mcaplot(train, factores)
```
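
`mcaplot` is another sourced helper; since the `ca` package is loaded for correspondence analysis, a plausible sketch is (hypothetical; the use of `ca::mjca` is our assumption):

```{r, eval=FALSE}
# Not run: hypothetical sketch of mcaplot, a multiple correspondence
# analysis of the factor columns plotted on the first two dimensions.
mcaplot <- function(df, factors) {
  factors <- intersect(factors, names(df))  # some factors were removed earlier
  fit <- mjca(df[, factors])
  plot(fit)
  fit
}
```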

### 3. Resampling protocol

```{r}
source("Resampling.R")
```
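
`Resampling.R` defines the `cross.validation*` helpers used throughout section 4. Given that `TunePareto` is loaded, the general-purpose one might look like this (a sketch under assumptions: stratified folds from `generateCVRuns`, refitting via `update`, and `has.class = TRUE` for models such as `rda` whose predictions carry a `$class` component):

```{r, eval=FALSE}
# Not run: hypothetical sketch of cross.validation, an ntimes x nfold
# stratified CV that refits the supplied model on each training fold.
cross.validation <- function(data, labels, model, nfold, ntimes, has.class) {
  runs <- generateCVRuns(labels, ntimes = ntimes, nfold = nfold,
                         stratified = TRUE)
  errs <- sapply(runs, function(run) {
    mean(sapply(run, function(fold) {
      fit  <- update(model, data = data[-fold, ])
      pred <- predict(fit, data[fold, ])
      if (has.class) pred <- pred$class     # e.g. rda/lda-style predictions
      mean(pred != labels[fold])
    }))
  })
  mean(errs)   # mean error over all repetitions
}
```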


### 4. Models

The models we are going to use are: 
  - LDA
  - QDA
  - RDA
  - k-NN
  - Naïve Bayes
  - GLM
  - Neural Networks
  

```{r}
rda.model <- rda(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
naive.model <- naiveBayes(V58~V3+V4+V9+V10+V11+V12+V14+V15+V16+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V29+V31+V32+V33+V34+V35+V37+V38+V39+V40+V41+V43+V44+V51+V56+V60+V61+V63+V65+V67+V68, data=train)
```


```{r}
cross.validation(train, train$V58, rda.model, 10, 10, T)
```


```{r}
rda.model.fda <- rda(V58~.,data=train)
```


```{r}
cross.validation(train, train$V58, rda.model.fda, 10, 10, T)
```


```{r}
cross.validation.naive(train, train$V58, naive.model, 10, 10)
```


```{r}
err <- c()
for (k in 1:20){
  err <- c(err, cross.validation.knn(train, train$V58, 10,10, k))
}
```
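
`cross.validation.knn` has no model object to refit; since `class::knn` wants a numeric matrix, a sketch (hypothetical, same fold protocol as above) might be:

```{r, eval=FALSE}
# Not run: hypothetical sketch of cross.validation.knn, expanding factors
# to dummy variables with model.matrix and running class::knn per fold.
cross.validation.knn <- function(data, labels, nfold, ntimes, k) {
  X <- model.matrix(V58 ~ . - 1, data)   # numeric design matrix, no intercept
  runs <- generateCVRuns(labels, ntimes = ntimes, nfold = nfold,
                         stratified = TRUE)
  mean(sapply(runs, function(run)
    mean(sapply(run, function(fold)
      mean(knn(X[-fold, ], X[fold, ], labels[-fold], k = k) != labels[fold])))))
}
```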


```{r}
plot(err, type = "l")
err
```
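
To read the best neighbourhood size off the curve programmatically (a convenience added here, not part of the original run):

```{r}
(k.best <- which.min(err))   # k with the lowest 10x10 CV error
err[k.best]
```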


```{r}
cross.validation.knn(train, train$V58, 10, 10, 1)

multinomial.model <- multinom(V58~., data=train)

cross.validation(train, train$V58, multinomial.model, 10, 10, F)

multinomial.model.step <- step(multinomial.model)

cross.validation(train, train$V58, multinomial.model.step, 10, 10, F)

multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)

cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, F)

multinomial.model.noFDA.step <- step(multinomial.model.noFDA)

cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, F)
```


### Test error

```{r}
pred.test <- predict(rda.model.fda, test)$class
(err.table <- table(True=test$V58, Pred=pred.test))
(err.test <- 1-sum(diag(err.table))/sum(err.table))
```
 

## Hungarian data

```{r}
hung <- read.csv("../data/hungarian.csv", header=F)
head(hung)
```


```{r}
# Missings
hung <- hung[,much.na.cols(hung,60)]
dummy <- c("V1", "V2", "V36", "V69", "V70", "V71", "V72", "V73", "V28")
hung <- remove.var(hung, dummy)
hung <- knn.imputation(hung, 7)

# Multicollinearity
corr.factors <- cor(hung)
which(abs(corr.factors)-diag(diag(corr.factors))>0.9, arr.ind=T)
hung <- remove.var(hung, c("V57", "V55"))

# Factors
factores <- c("V58", "V4", "V9", "V16", "V19", "V20", "V21", "V22", "V23", "V24", "V25", "V26", "V27", "V38", "V39", "V56", "V11")
for (f in factores){
  hung[,f] <- as.factor(hung[,f])
}
hung <- move.value(hung, "V25", 2, 1)
```

```{r}
summary(hung)
```

```{r}
histograms(hung)
boxplot.num(hung)
histograms(hung, F)
show.cor(hung)
```

```{r}
qqplots(hung)
```

```{r}
boxcox.plots(hung)
```


```{r}
boxcox.plot.special(hung, c("V40"))
```

```{r}
hung <- apply.trans(hung, sqrt.neg.vars=c("V10", "V12"), sqrt.vars = c("V31", "V29", "V42", "V43"), log.vars = c("V6", "V7", "V40"))
hung <- scale.num(hung)
```

```{r}
hung <- remove.var(hung, c("V23", "V39"))  # near-constant factors
hung <- move.value(hung, "V19", 2, 1)      # merge the rare level 2 of V19 into 1
summary(hung)
```


```{r}
set.seed(2000)
n <- nrow(hung)
train.length <- round(2*n/3)

hung <- hung[sample(n),]
train <- hung[1:train.length,]
test <- hung[(train.length+1):n,]

# Map the five levels of V58 (0-4) to plotting colors
col.class <- c("red", "green", "blue", "yellow", "purple")[as.numeric(train$V58)]
```

```{r}
# Colors for the full hung data: the PCA below uses all rows, not just train
col.class2 <- c("red", "green", "blue", "yellow", "purple")[as.numeric(hung$V58)]
pca <- pca.num(hung)
plot.pca(hung, col.class2, pca = pca)
# hung <- extract.pca(pca, hung)
```

```{r}
fda <- plot.fda(train, V58~.-V56, col.class)

train <- extract.fda(fda, train)
test <- extract.fda(fda, test)
```

```{r}
# Same class-to-color mapping as the earlier plots
plot(test$LD1, test$LD2, col=c("red","green","blue","yellow","purple")[as.numeric(test$V58)], xlab="LD1", ylab="LD2")
legend("topleft", legend=c("0","1","2","3","4"), fill=c("red","green","blue","yellow","purple"))
```


```{r}
mca.features <- mcaplot(train, factores)
```

```{r}
rda.model <- rda(V58~.-LD1-LD2-LD3-LD4, data=train)
naive.model <- naiveBayes(V58~., data=train)
```


```{r}
cross.validation(train, train$V58, rda.model, 10, 10, T)
```


```{r}
rda.model.fda <- rda(V58~.,data=train)
```


```{r}
cross.validation(train, train$V58, rda.model.fda, 2, 10, T)
```


```{r}
cross.validation.naive(train, train$V58, naive.model, 10, 10)
```


```{r}
err <- c()
for (k in 1:20){
  err <- c(err, cross.validation.knn(train, train$V58, 10,10, k))
}
```


```{r}
plot(err, type = "l")
err
```


```{r}
multinomial.model <- multinom(V58~., data=train)

cross.validation(train, train$V58, multinomial.model, 10, 10, F)

multinomial.model.step <- step(multinomial.model)

cross.validation(train, train$V58, multinomial.model.step, 10, 10, F)

multinomial.model.noFDA <- multinom(V58~.-LD1-LD2-LD3-LD4, data=train)

cross.validation(train, train$V58, multinomial.model.noFDA, 10, 10, F)

multinomial.model.noFDA.step <- step(multinomial.model.noFDA)

cross.validation(train, train$V58, multinomial.model.noFDA.step, 10, 10, F)
```


```{r}
qda.model <- qda(V58~.-LD1-LD2-LD3-LD4, data=train)
```


### Neural networks

```{r}
# CV protocol for caret::train: 10-fold CV repeated 10 times
trc <- trainControl(method="repeatedcv", number=10, repeats=10)
```


```{r}
decays <- c(0.0001, 0.001, 0.01, 0.1, 1)
nn.model10x10CV <- train(V58~LD1+LD2+LD3+LD4, data = train, method = 'nnet', 
                         trace=F, maxit=1000,
                         tuneGrid = expand.grid(.size=9,.decay=decays), trControl=trc)
```


```{r}
nn.model10x10CV$results
nn.model10x10CV$bestTune
```

```{r}
trc <- trainControl(method="repeatedcv", number=5, repeats=1)  # lighter protocol
decays <- c(0, 0.01, 0.1, 1)
nn.model10x10CV <- train(V58~., data = train, method = 'nnet', 
                         trace=F, maxit=1000, MaxNWts=2000,
                         tuneGrid = expand.grid(.size=9,.decay=decays), trControl=trc)
```


```{r}
nn.model10x10CV$results
nn.model10x10CV$bestTune
```

```{r}
decays <- c(1, 10, 100, 1000)
nn.model10x10CV <- train(V58~LD1+LD2+LD3+LD4, data = train, method = 'nnet', 
                         trace=F, maxit=1000,
                         tuneGrid = expand.grid(.size=9,.decay=decays), trControl=trc)
```


```{r}
nn.model10x10CV$results
nn.model10x10CV$bestTune
```

```{r}
nn <- nnet(V58~., data=train, maxit=1000, size=9, decay=1, MaxNWts=2000)
```

```{r}
getvalue <- function(row){
  if (row[1]){
    0
  } else if (row[2]){
    1
  } else if (row[3]){
    2
  } else if (row[4]){
    3
  } else if (row[5]){
    4
  } else {
    5
  }
}
```
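
`getvalue` is defined but never called below; on a logical one-hot row it matches the `which.max` idiom the notebook actually uses:

```{r}
row <- c(FALSE, TRUE, FALSE, FALSE, FALSE)
getvalue(row)        # 1
which.max(row) - 1   # 1, the form used in the next chunk
```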

```{r}
table(train$V58, apply(nn$fitted.values, 1, which.max)-1)
```

```{r}
pred <- predict(nn, test)
(tab <- table(test$V58, apply(pred, 1, which.max)-1))
(err.test <- 1 - sum(diag(tab))/sum(tab))
```


```{r}
trc <- trainControl(method="repeatedcv", number=10, repeats=1)
decays <- c(0.67, 0.66, 0.68)
nn.model10x10CV <- train(V58~.-LD1-LD2-LD3-LD4-V56-V20-V21-V22, data = train,
                         method = 'nnet',
                         trace=F, maxit=1000, MaxNWts=10000,
                         tuneGrid = expand.grid(.size=20,.decay=decays),
                         trControl=trc)
```


```{r}
nn.model10x10CV$results
nn.model10x10CV$bestTune
```

```{r}
nn <- nnet(V58~.-LD1-LD2-LD3-LD4-V56-V20-V21-V22, data=train, maxit=1000, size=20, decay=0.68, MaxNWts=10000)
(tab <- table(train$V58, apply(nn$fitted.values, 1, which.max)-1))
(err <- 1 - sum(diag(tab))/sum(tab))
```


```{r}
pred <- predict(nn, test)
(tab <- table(test$V58, apply(pred, 1, which.max)-1))
(err <- 1 - sum(diag(tab))/sum(tab))

# Collapse the 5-class results to binary: no disease (0) vs disease (1-4)
target <- as.numeric(test$V58)-1
target[target > 1] <- 1

pred <- apply(pred, 1, which.max)-1
pred[pred > 1] <- 1

(tab <- table(target, pred))
(err <- 1 - sum(diag(tab))/sum(tab))
```

### Two-class problem


```{r}
aux <- hung
aux$V58[aux$V58 != 0] <- 1
aux$V58 <- droplevels(aux$V58)
```
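
A quick look (added for illustration) at the resulting class balance:

```{r}
table(aux$V58)   # 0 = no disease, 1 = any severity 1-4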

```{r}
set.seed(2000)
n <- nrow(aux)
train.length <- round(2*n/3)

aux <- aux[sample(n),]
train_aux <- aux[1:train.length,]
test_aux <- aux[(train.length+1):n,]
```

```{r}
nn <- nnet(V58~.-V56-V20-V21-V22, data=train_aux, maxit=1000, size=30, decay=0, MaxNWts = 10000)  # decay=0: overfits, training error reaches 0
(tab<- table(train_aux$V58, (nn$fitted.values > 0.5)*1))
(err.train <- 1 - sum(diag(tab))/sum(tab))
```

```{r}
decays <- c(0.52, 0.55, 0.57, 0.6)
trc <- trainControl(method="repeatedcv", number=10, repeats=1)
nn.model10x10CV <- train(V58~.-V56-V20-V21-V22, data = train_aux,
                         method = 'nnet',
                         trace=F, maxit=1000, MaxNWts=10000000,
                         tuneGrid = expand.grid(.size=30,.decay=decays), trControl=trc)
```


```{r}
nn.model10x10CV$results
nn.model10x10CV$bestTune
```

```{r}
nn <- nnet(V58~.-V56-V20-V21-V22, data=train_aux, maxit=1000, size=30, decay=0.55, MaxNWts=10000)

(tab<- table(train_aux$V58, (nn$fitted.values > 0.5)*1))
(err.train <- 1 - sum(diag(tab))/sum(tab))
```



```{r}
pred <- predict(nn, test_aux)
(tab <- table(test_aux$V58, (pred > 0.5)*1))
(err.test <- 1 - sum(diag(tab))/sum(tab))
```

